#Importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
files = files.upload()
# Load the dataset into a dataframe
df = pd.read_csv("NYC_listings.csv")
df
<ipython-input-3-a3d3d522f803>:3: DtypeWarning: Columns (68) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv("NYC_listings.csv")
| id | listing_url | scrape_id | last_scraped | source | name | description | neighborhood_overview | picture_url | host_id | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | https://www.airbnb.com/rooms/2595 | 20221204162430 | 2022-12-05 | city scrape | Skylit Midtown Castle | Beautiful, spacious skylit studio in the heart... | Centrally located in the heart of Manhattan ju... | https://a0.muscache.com/pictures/f0813a11-40b2... | 2845 | ... | 4.80 | 4.81 | 4.40 | NaN | f | 3 | 3 | 0 | 0 | 0.31 |
| 1 | 5203 | https://www.airbnb.com/rooms/5203 | 20221204162430 | 2022-12-05 | previous scrape | Cozy Clean Guest Room - Family Apt | Our best guests are seeking a safe, clean, spa... | Our neighborhood is full of restaurants and ca... | https://a0.muscache.com/pictures/103776/b37157... | 7490 | ... | 4.95 | 4.94 | 4.92 | NaN | f | 1 | 0 | 1 | 0 | 0.73 |
| 2 | 5136 | https://www.airbnb.com/rooms/5136 | 20221204162430 | 2022-12-04 | city scrape | Spacious Brooklyn Duplex, Patio + Garden | We welcome you to stay in our lovely 2 br dupl... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 7378 | ... | 5.00 | 4.67 | 5.00 | NaN | f | 1 | 1 | 0 | 0 | 0.03 |
| 3 | 5121 | https://www.airbnb.com/rooms/5121 | 20221204162430 | 2022-12-05 | city scrape | BlissArtsSpace! | One room available for rent in a 2 bedroom apt... | NaN | https://a0.muscache.com/pictures/2090980c-b68e... | 7356 | ... | 4.91 | 4.47 | 4.52 | NaN | f | 2 | 0 | 2 | 0 | 0.30 |
| 4 | 6848 | https://www.airbnb.com/rooms/6848 | 20221204162430 | 2022-12-05 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | https://a0.muscache.com/pictures/e4f031a7-f146... | 15991 | ... | 4.80 | 4.67 | 4.56 | NaN | f | 1 | 1 | 0 | 0 | 1.13 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41528 | 772683159414917117 | https://www.airbnb.com/rooms/772683159414917117 | 20221204162430 | 2022-12-05 | city scrape | Dahiari | Desconecta de tus preocupaciones en este espac... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 125534010 | ... | NaN | NaN | NaN | NaN | f | 1 | 1 | 0 | 0 | NaN |
| 41529 | 772705452516314073 | https://www.airbnb.com/rooms/772705452516314073 | 20221204162430 | 2022-12-05 | city scrape | Beautiful Basement | Your family will be close to everything when y... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 338424773 | ... | NaN | NaN | NaN | NaN | t | 1 | 0 | 1 | 0 | NaN |
| 41530 | 772710779275911753 | https://www.airbnb.com/rooms/772710779275911753 | 20221204162430 | 2022-12-05 | city scrape | Central Park Close By - 24 | This is a Three-Bedroom Apartment. You will ha... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 2653479 | ... | NaN | NaN | NaN | NaN | t | 37 | 2 | 35 | 0 | NaN |
| 41531 | 772714221060214808 | https://www.airbnb.com/rooms/772714221060214808 | 20221204162430 | 2022-12-04 | city scrape | Good Vibes at The Bronx | Keep it simple at this peaceful and centrally-... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 421264574 | ... | NaN | NaN | NaN | NaN | t | 1 | 1 | 0 | 0 | NaN |
| 41532 | 772716724205003579 | https://www.airbnb.com/rooms/772716724205003579 | 20221204162430 | 2022-12-05 | city scrape | 2 bedroom Condo near West Village | This beautifully decorated condo will give you... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 481177884 | ... | NaN | NaN | NaN | NaN | t | 2 | 2 | 0 | 0 | NaN |
41533 rows × 75 columns
df.head(30)
| id | listing_url | scrape_id | last_scraped | source | name | description | neighborhood_overview | picture_url | host_id | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | https://www.airbnb.com/rooms/2595 | 20221204162430 | 2022-12-05 | city scrape | Skylit Midtown Castle | Beautiful, spacious skylit studio in the heart... | Centrally located in the heart of Manhattan ju... | https://a0.muscache.com/pictures/f0813a11-40b2... | 2845 | ... | 4.80 | 4.81 | 4.40 | NaN | f | 3 | 3 | 0 | 0 | 0.31 |
| 1 | 5203 | https://www.airbnb.com/rooms/5203 | 20221204162430 | 2022-12-05 | previous scrape | Cozy Clean Guest Room - Family Apt | Our best guests are seeking a safe, clean, spa... | Our neighborhood is full of restaurants and ca... | https://a0.muscache.com/pictures/103776/b37157... | 7490 | ... | 4.95 | 4.94 | 4.92 | NaN | f | 1 | 0 | 1 | 0 | 0.73 |
| 2 | 5136 | https://www.airbnb.com/rooms/5136 | 20221204162430 | 2022-12-04 | city scrape | Spacious Brooklyn Duplex, Patio + Garden | We welcome you to stay in our lovely 2 br dupl... | NaN | https://a0.muscache.com/pictures/miso/Hosting-... | 7378 | ... | 5.00 | 4.67 | 5.00 | NaN | f | 1 | 1 | 0 | 0 | 0.03 |
| 3 | 5121 | https://www.airbnb.com/rooms/5121 | 20221204162430 | 2022-12-05 | city scrape | BlissArtsSpace! | One room available for rent in a 2 bedroom apt... | NaN | https://a0.muscache.com/pictures/2090980c-b68e... | 7356 | ... | 4.91 | 4.47 | 4.52 | NaN | f | 2 | 0 | 2 | 0 | 0.30 |
| 4 | 6848 | https://www.airbnb.com/rooms/6848 | 20221204162430 | 2022-12-05 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | https://a0.muscache.com/pictures/e4f031a7-f146... | 15991 | ... | 4.80 | 4.67 | 4.56 | NaN | f | 1 | 1 | 0 | 0 | 1.13 |
| 5 | 5178 | https://www.airbnb.com/rooms/5178 | 20221204162430 | 2022-12-05 | city scrape | Large Furnished Room Near B'way | Please don’t expect the luxury here just a bas... | Theater district, many restaurants around here. | https://a0.muscache.com/pictures/12065/f070997... | 8967 | ... | 4.45 | 4.88 | 4.39 | NaN | f | 1 | 0 | 1 | 0 | 3.38 |
| 6 | 6990 | https://www.airbnb.com/rooms/6990 | 20221204162430 | 2022-12-05 | city scrape | UES Beautiful Blue Room | Beautiful peaceful healthy home<br /><br /><b>... | Location: Five minutes to Central Park, Museum... | https://a0.muscache.com/pictures/be6cd5b3-9295... | 16800 | ... | 4.95 | 4.84 | 4.85 | NaN | t | 1 | 0 | 1 | 0 | 1.52 |
| 7 | 6872 | https://www.airbnb.com/rooms/6872 | 20221204162430 | 2022-12-05 | city scrape | Uptown Sanctuary w/ Private Bath (Month to Month) | A charming month-to-month home away from home ... | This sweet Harlem sanctuary is a 10-20 minute ... | https://a0.muscache.com/pictures/miso/Hosting-... | 16104 | ... | 5.00 | 5.00 | 5.00 | NaN | f | 2 | 0 | 2 | 0 | 0.16 |
| 8 | 7097 | https://www.airbnb.com/rooms/7097 | 20221204162430 | 2022-12-04 | city scrape | Perfect for Your Parents: Privacy + Garden | Parents/grandparents coming to town, or just h... | Residential, village-like atmosphere. Lots of ... | https://a0.muscache.com/pictures/miso/Hosting-... | 17571 | ... | 4.92 | 4.94 | 4.81 | NaN | t | 2 | 1 | 1 | 0 | 2.01 |
| 9 | 7064 | https://www.airbnb.com/rooms/7064 | 20221204162430 | 2022-12-05 | city scrape | Amazing location! Wburg. Large, bright & tranquil | Large, private loft-like room in a spacious 2-... | - One stop from the East Village, Lower East S... | https://a0.muscache.com/pictures/13708959/7e74... | 17297 | ... | 5.00 | 5.00 | 5.00 | NaN | f | 2 | 0 | 2 | 0 | 0.09 |
| 10 | 8490 | https://www.airbnb.com/rooms/8490 | 20221204162430 | 2022-12-05 | city scrape | Maison des Sirenes1,bohemian, luminous apartment | <b>The space</b><br />I am the lucky owner of ... | NaN | https://a0.muscache.com/pictures/1c51369e-a251... | 25183 | ... | 4.87 | 4.65 | 4.75 | NaN | f | 2 | 2 | 0 | 0 | 1.01 |
| 11 | 7801 | https://www.airbnb.com/rooms/7801 | 20221204162430 | 2022-12-05 | city scrape | Sweet and Spacious Brooklyn Loft | A true open-plan loft in a repurposed factory ... | We've lived here for over 10 years and watched... | https://a0.muscache.com/pictures/207102/56d6fc... | 21207 | ... | 4.60 | 5.00 | 4.80 | NaN | f | 1 | 1 | 0 | 0 | 0.06 |
| 12 | 9357 | https://www.airbnb.com/rooms/9357 | 20221204162430 | 2022-12-05 | previous scrape | Midtown Pied-a-terre | HELLO. PLEASE DO NOT HIT "REQUEST TO BOOK". H... | Quiet residential block near many restaurants ... | https://a0.muscache.com/pictures/90036/4e60665... | 30193 | ... | 5.00 | 4.95 | 4.58 | NaN | f | 1 | 1 | 0 | 0 | 0.36 |
| 13 | 5803 | https://www.airbnb.com/rooms/5803 | 20221204162430 | 2022-12-04 | city scrape | Lovely Room 1 in BEST AREA; Legal Rental, Spot... | Beautiful house, gorgeous garden, large patio,... | Neighborhood is amazing!<br />Best subways to ... | https://a0.muscache.com/pictures/2884180/f19a1... | 9744 | ... | 4.83 | 4.87 | 4.74 | NaN | f | 3 | 1 | 2 | 0 | 1.31 |
| 14 | 10962 | https://www.airbnb.com/rooms/10962 | 20221204162430 | 2022-12-04 | city scrape | Lovely Room 2 in BEST AREA; Legal Rental, Spot... | Lovely room, gorgeous garden, helpful host in... | Neighborhood is wonderful, a great walking nei... | https://a0.muscache.com/pictures/2885219/f762f... | 9744 | ... | 4.78 | 4.88 | 4.74 | NaN | f | 3 | 1 | 2 | 0 | 1.37 |
| 15 | 9704 | https://www.airbnb.com/rooms/9704 | 20221204162430 | 2022-12-05 | city scrape | Spacious 1 bedroom in luxe building | The room is spacious, the neighborhood is safe... | NaN | https://a0.muscache.com/pictures/38418/569b54f... | 32045 | ... | 4.84 | 4.84 | 4.90 | NaN | f | 1 | 0 | 1 | 0 | 0.95 |
| 16 | 12192 | https://www.airbnb.com/rooms/12192 | 20221204162430 | 2022-12-05 | city scrape | ENJOY Downtown NYC! | Please be vaccinated and responsible if you ar... | Enjoy great food, music, unique shops, night-l... | https://a0.muscache.com/pictures/miso/Hosting-... | 46978 | ... | 4.85 | 4.69 | 4.52 | NaN | f | 2 | 0 | 2 | 0 | 1.82 |
| 17 | 11943 | https://www.airbnb.com/rooms/11943 | 20221204162430 | 2022-12-05 | previous scrape | Country space in the city | <b>The space</b><br />Ditmas Park. Entire 3rd ... | NaN | https://a0.muscache.com/pictures/53007/d30884b... | 45445 | ... | NaN | NaN | NaN | NaN | f | 1 | 0 | 1 | 0 | NaN |
| 18 | 12940 | https://www.airbnb.com/rooms/12940 | 20221204162430 | 2022-12-05 | city scrape | Charming Brownstone 3 - Near PRATT | Super cute 1 bedroom apartment in a 100 year o... | Multicultural melting pot. Lots of cafes, bar... | https://a0.muscache.com/pictures/miso/Hosting-... | 50148 | ... | 4.52 | 4.03 | 4.36 | NaN | f | 1 | 1 | 0 | 0 | 0.46 |
| 19 | 12937 | https://www.airbnb.com/rooms/12937 | 20221204162430 | 2022-12-05 | city scrape | 1 Stop fr. Manhattan! Private Suite,Landmark B... | Private room, dedicated bath and a separate en... | Long Island City is the hottest neighborhood i... | https://a0.muscache.com/pictures/10f2783b-5e8e... | 50124 | ... | 4.91 | 4.90 | 4.86 | NaN | f | 1 | 0 | 1 | 0 | 2.30 |
| 20 | 10452 | https://www.airbnb.com/rooms/10452 | 20221204162430 | 2022-12-05 | city scrape | Large B&B Style rooms | Great location.<br /><br /><b>The space</b><br... | NaN | https://a0.muscache.com/pictures/16336315/c4bf... | 35935 | ... | 4.84 | 4.39 | 4.64 | NaN | f | 5 | 0 | 5 | 0 | 0.53 |
| 21 | 31130 | https://www.airbnb.com/rooms/31130 | 20221204162430 | 2022-12-05 | city scrape | Most Central Location! | The bedroom is set up completely for you. Seco... | Central Park, TimeWarner center on Columbus Ci... | https://a0.muscache.com/pictures/32ad29f2-419b... | 117287 | ... | 4.95 | 5.00 | 4.83 | NaN | f | 4 | 2 | 2 | 0 | 0.45 |
| 22 | 13808 | https://www.airbnb.com/rooms/13808 | 20221204162430 | 2022-12-05 | city scrape | Blue Room for 2 in Brownstone for $1350 monthly | Romantic quiet room in a beautiful 1800 Libert... | We are in New York! And Brooklyn is the new hi... | https://a0.muscache.com/pictures/81099/72ccf5f... | 54275 | ... | 4.86 | 4.54 | 4.71 | NaN | f | 4 | 0 | 4 | 0 | 1.18 |
| 23 | 14290 | https://www.airbnb.com/rooms/14290 | 20221204162430 | 2022-12-05 | city scrape | * ORIGINAL BROOKLYN LOFT * | Original factory building loft, lots of natur... | Bushwick is a constantly changing area, new o... | https://a0.muscache.com/pictures/448859/dbf8f1... | 56104 | ... | 4.54 | 4.74 | 4.68 | NaN | f | 1 | 1 | 0 | 0 | 0.97 |
| 24 | 31555 | https://www.airbnb.com/rooms/31555 | 20221204162430 | 2022-12-05 | previous scrape | Luminous Beautiful West Village Studio | Wonderfully bright, nicely furnished 400-squar... | Tree-lined streets, buzzing bar and restaurant... | https://a0.muscache.com/pictures/70290811/e989... | 135619 | ... | 4.97 | 4.93 | 4.69 | NaN | f | 1 | 1 | 0 | 0 | 0.22 |
| 25 | 29683 | https://www.airbnb.com/rooms/29683 | 20221204162430 | 2022-12-05 | city scrape | Stylish Apartment with office space Near SoHo! | Modern 1 bedroom apartment with stylish Scandi... | NoHo is perfectly situated downtown Manhattan.... | https://a0.muscache.com/pictures/d670b401-5ce5... | 125857 | ... | 4.92 | 4.90 | 4.63 | NaN | f | 2 | 1 | 1 | 0 | 0.77 |
| 26 | 31902 | https://www.airbnb.com/rooms/31902 | 20221204162430 | 2022-12-05 | previous scrape | Sanctuary in East Flatbush | Come and Stay in a warm and nurturing environm... | I love the sweetness and tranquility. I love ... | https://a0.muscache.com/pictures/74729149/1220... | 137292 | ... | 4.33 | 4.33 | 4.33 | NaN | f | 1 | 0 | 1 | 0 | 0.03 |
| 27 | 61509 | https://www.airbnb.com/rooms/61509 | 20221204162430 | 2022-12-05 | previous scrape | Quiet, clean midtown apt w. elevato | This apartment is available until July 30th. S... | It is located steps away from Grand central, m... | https://a0.muscache.com/pictures/12284324/9fbf... | 23619 | ... | 4.52 | 4.84 | 4.41 | NaN | f | 1 | 1 | 0 | 0 | 0.64 |
| 28 | 62427 | https://www.airbnb.com/rooms/62427 | 20221204162430 | 2022-12-05 | previous scrape | Great East Village Apartment Rental | Be in the heart of the best neighborhood in NY... | NaN | https://a0.muscache.com/pictures/381971/8d6dd5... | 303882 | ... | 4.97 | 4.98 | 4.79 | NaN | f | 1 | 1 | 0 | 0 | 0.45 |
| 29 | 14314 | https://www.airbnb.com/rooms/14314 | 20221204162430 | 2022-12-05 | city scrape | Greenpoint Place...Has It All! | Cozy, comfortable, one bedroom apartment on gr... | NaN | https://a0.muscache.com/pictures/67332445/1478... | 56246 | ... | 4.93 | 4.80 | 4.78 | NaN | f | 1 | 1 | 0 | 0 | 1.13 |
30 rows × 75 columns
plt.figure(figsize=(20,10))
sns.heatmap(df.isna().transpose(),
cmap="YlGnBu",
cbar_kws={'label': 'Missing Data'})
plt.savefig("visualizing_missing_data_with_heatmap_Seaborn_Python.png", dpi=100)
df.shape
(41533, 75)
# Missing value count for each column
df_missing = df.isna().sum()
df_missing = df_missing.sort_values(ascending = False)
print(df_missing.to_markdown())
| | 0 | |:---------------------------------------------|------:| | bathrooms | 41533 | | calendar_updated | 41533 | | license | 41532 | | host_about | 18312 | | neighborhood_overview | 17444 | | neighbourhood | 17443 | | host_response_time | 13645 | | host_response_rate | 13645 | | host_acceptance_rate | 12211 | | review_scores_value | 9848 | | review_scores_location | 9848 | | review_scores_checkin | 9845 | | review_scores_accuracy | 9841 | | review_scores_communication | 9836 | | review_scores_cleanliness | 9831 | | reviews_per_month | 9393 | | first_review | 9393 | | last_review | 9393 | | review_scores_rating | 9393 | | host_neighbourhood | 8189 | | host_location | 7745 | | bedrooms | 3822 | | beds | 941 | | description | 786 | | bathrooms_text | 77 | | host_is_superhost | 29 | | maximum_nights_avg_ntm | 14 | | minimum_nights_avg_ntm | 14 | | maximum_maximum_nights | 14 | | minimum_maximum_nights | 14 | | maximum_minimum_nights | 14 | | minimum_minimum_nights | 14 | | name | 13 | | host_name | 5 | | host_since | 5 | | host_total_listings_count | 5 | | host_listings_count | 5 | | host_picture_url | 5 | | host_identity_verified | 5 | | host_has_profile_pic | 5 | | host_thumbnail_url | 5 | | listing_url | 0 | | host_verifications | 0 | | number_of_reviews_l30d | 0 | | host_url | 0 | | host_id | 0 | | picture_url | 0 | | source | 0 | | last_scraped | 0 | | scrape_id | 0 | | instant_bookable | 0 | | calculated_host_listings_count | 0 | | calculated_host_listings_count_entire_homes | 0 | | calculated_host_listings_count_private_rooms | 0 | | calculated_host_listings_count_shared_rooms | 0 | | number_of_reviews_ltm | 0 | | number_of_reviews | 0 | | calendar_last_scraped | 0 | | availability_365 | 0 | | amenities | 0 | | price | 0 | | minimum_nights | 0 | | maximum_nights | 0 | | accommodates | 0 | | room_type | 0 | | property_type | 0 | | longitude | 0 | | latitude | 0 | | neighbourhood_group_cleansed | 0 | | neighbourhood_cleansed | 0 | | has_availability | 0 | | availability_30 | 0 | | availability_60 | 0 | | availability_90 | 0 | | id | 0 |
# Missing value percentage count for each column
missing_values = df.isnull().mean()*100
missing_values = missing_values.sort_values(ascending = False)
print(missing_values.to_markdown())
| | 0 | |:---------------------------------------------|------------:| | bathrooms | 100 | | calendar_updated | 100 | | license | 99.9976 | | host_about | 44.0902 | | neighborhood_overview | 42.0003 | | neighbourhood | 41.9979 | | host_response_time | 32.8534 | | host_response_rate | 32.8534 | | host_acceptance_rate | 29.4007 | | review_scores_value | 23.7113 | | review_scores_location | 23.7113 | | review_scores_checkin | 23.704 | | review_scores_accuracy | 23.6944 | | review_scores_communication | 23.6824 | | review_scores_cleanliness | 23.6703 | | reviews_per_month | 22.6158 | | first_review | 22.6158 | | last_review | 22.6158 | | review_scores_rating | 22.6158 | | host_neighbourhood | 19.7169 | | host_location | 18.6478 | | bedrooms | 9.20232 | | beds | 2.26567 | | description | 1.89247 | | bathrooms_text | 0.185395 | | host_is_superhost | 0.069824 | | maximum_nights_avg_ntm | 0.0337081 | | minimum_nights_avg_ntm | 0.0337081 | | maximum_maximum_nights | 0.0337081 | | minimum_maximum_nights | 0.0337081 | | maximum_minimum_nights | 0.0337081 | | minimum_minimum_nights | 0.0337081 | | name | 0.0313004 | | host_name | 0.0120386 | | host_since | 0.0120386 | | host_total_listings_count | 0.0120386 | | host_listings_count | 0.0120386 | | host_picture_url | 0.0120386 | | host_identity_verified | 0.0120386 | | host_has_profile_pic | 0.0120386 | | host_thumbnail_url | 0.0120386 | | listing_url | 0 | | host_verifications | 0 | | number_of_reviews_l30d | 0 | | host_url | 0 | | host_id | 0 | | picture_url | 0 | | source | 0 | | last_scraped | 0 | | scrape_id | 0 | | instant_bookable | 0 | | calculated_host_listings_count | 0 | | calculated_host_listings_count_entire_homes | 0 | | calculated_host_listings_count_private_rooms | 0 | | calculated_host_listings_count_shared_rooms | 0 | | number_of_reviews_ltm | 0 | | number_of_reviews | 0 | | calendar_last_scraped | 0 | | availability_365 | 0 | | amenities | 0 | | price | 0 | | minimum_nights | 0 | | maximum_nights | 0 | | accommodates | 0 | | room_type | 0 | | property_type | 0 | | longitude | 0 | | latitude | 0 | | neighbourhood_group_cleansed | 0 | | neighbourhood_cleansed | 0 | | has_availability | 0 | | availability_30 | 0 | | availability_60 | 0 | | availability_90 | 0 | | id | 0 |
df[['maximum_nights_avg_ntm','minimum_nights_avg_ntm','maximum_maximum_nights','minimum_maximum_nights','maximum_minimum_nights','minimum_minimum_nights','minimum_nights','maximum_nights']].head(10)
| maximum_nights_avg_ntm | minimum_nights_avg_ntm | maximum_maximum_nights | minimum_maximum_nights | maximum_minimum_nights | minimum_minimum_nights | minimum_nights | maximum_nights | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1125.0 | 30.0 | 1125.0 | 1125.0 | 30.0 | 30.0 | 30 | 1125 |
| 1 | 14.0 | 2.0 | 14.0 | 14.0 | 2.0 | 2.0 | 2 | 14 |
| 2 | 1125.0 | 21.0 | 1125.0 | 1125.0 | 21.0 | 21.0 | 21 | 1125 |
| 3 | 730.0 | 30.0 | 730.0 | 730.0 | 30.0 | 30.0 | 30 | 730 |
| 4 | 1125.0 | 30.0 | 1125.0 | 1125.0 | 30.0 | 30.0 | 30 | 730 |
| 5 | 14.0 | 2.0 | 14.0 | 14.0 | 2.0 | 2.0 | 2 | 14 |
| 6 | 1125.0 | 30.0 | 1125.0 | 1125.0 | 30.0 | 30.0 | 30 | 700 |
| 7 | 180.0 | 30.0 | 180.0 | 180.0 | 30.0 | 30.0 | 30 | 180 |
| 8 | 1125.0 | 3.0 | 1125.0 | 1125.0 | 3.0 | 3.0 | 3 | 1125 |
| 9 | 45.0 | 7.0 | 45.0 | 45.0 | 7.0 | 7.0 | 7 | 45 |
df[['bathrooms','bathrooms_text']].head(10)
| bathrooms | bathrooms_text | |
|---|---|---|
| 0 | NaN | 1 bath |
| 1 | NaN | 1 shared bath |
| 2 | NaN | 1.5 baths |
| 3 | NaN | NaN |
| 4 | NaN | 1 bath |
| 5 | NaN | 1 bath |
| 6 | NaN | 1 shared bath |
| 7 | NaN | 1 shared bath |
| 8 | NaN | 1 bath |
| 9 | NaN | 1 shared bath |
df[['host_total_listings_count','host_listings_count']].head(10)
| host_total_listings_count | host_listings_count | |
|---|---|---|
| 0 | 9.0 | 6.0 |
| 1 | 5.0 | 1.0 |
| 2 | 5.0 | 1.0 |
| 3 | 2.0 | 2.0 |
| 4 | 1.0 | 1.0 |
| 5 | 1.0 | 1.0 |
| 6 | 4.0 | 1.0 |
| 7 | 2.0 | 2.0 |
| 8 | 2.0 | 2.0 |
| 9 | 2.0 | 2.0 |
All the columns with URL host_picture_url,host_has_profile_pic,host_url,picture_url,host_thumbnail_url,listing_url because we wont need it.
bathrooms and bathrooms_text is one and the same and lets use only one.
host_total_listings_count and host_listings_count is almost unique, so lets drop one.
Few of the unnecessary columns can also be dropped such as : scrape_id, last_scraped,host_verifications, host_identity_verified.
maximum_nights_avg_ntm,minimum_nights_avg_ntm,maximum_maximum_nights,minimum_maximum_nights,maximum_minimum_nights,minimum_minimum_nights,minimum_nights,maximum_nights.
It doesnt makes sense to keep all of this. Hence we will be keeping minimum_nights and maximum_nights only.
drop_cols = ['host_picture_url','host_has_profile_pic','host_url','picture_url','host_thumbnail_url','listing_url','bathrooms','host_total_listings_count',
'scrape_id','last_scraped','host_verifications','host_identity_verified','maximum_nights_avg_ntm','minimum_nights_avg_ntm',
'maximum_maximum_nights','minimum_maximum_nights','maximum_minimum_nights','minimum_minimum_nights']
df_airbnb = df.drop(drop_cols, axis=1)
df_airbnb.shape
(41533, 57)
df_airbnb.shape
(41533, 57)
df_airbnb
| id | source | name | description | neighborhood_overview | host_id | host_name | host_since | host_location | host_about | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | city scrape | Skylit Midtown Castle | Beautiful, spacious skylit studio in the heart... | Centrally located in the heart of Manhattan ju... | 2845 | Jennifer | 2008-09-09 | New York, NY | A New Yorker since (Phone number hidden by Air... | ... | 4.80 | 4.81 | 4.40 | NaN | f | 3 | 3 | 0 | 0 | 0.31 |
| 1 | 5203 | previous scrape | Cozy Clean Guest Room - Family Apt | Our best guests are seeking a safe, clean, spa... | Our neighborhood is full of restaurants and ca... | 7490 | MaryEllen | 2009-02-05 | New York, NY | Welcome to family life with my oldest two away... | ... | 4.95 | 4.94 | 4.92 | NaN | f | 1 | 0 | 1 | 0 | 0.73 |
| 2 | 5136 | city scrape | Spacious Brooklyn Duplex, Patio + Garden | We welcome you to stay in our lovely 2 br dupl... | NaN | 7378 | Rebecca | 2009-02-03 | New York, NY | Rebecca is an artist/designer, and Henoch is i... | ... | 5.00 | 4.67 | 5.00 | NaN | f | 1 | 1 | 0 | 0 | 0.03 |
| 3 | 5121 | city scrape | BlissArtsSpace! | One room available for rent in a 2 bedroom apt... | NaN | 7356 | Garon | 2009-02-03 | New York, NY | I am an artist(painter, filmmaker) and curato... | ... | 4.91 | 4.47 | 4.52 | NaN | f | 2 | 0 | 2 | 0 | 0.30 |
| 4 | 6848 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | 15991 | Allen & Irina | 2009-05-06 | New York, NY | We love to travel. When we travel we like to s... | ... | 4.80 | 4.67 | 4.56 | NaN | f | 1 | 1 | 0 | 0 | 1.13 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41528 | 772683159414917117 | city scrape | Dahiari | Desconecta de tus preocupaciones en este espac... | NaN | 125534010 | Larissa | 2017-04-12 | Dominican Republic | NaN | ... | NaN | NaN | NaN | NaN | f | 1 | 1 | 0 | 0 | NaN |
| 41529 | 772705452516314073 | city scrape | Beautiful Basement | Your family will be close to everything when y... | NaN | 338424773 | Md | 2020-02-24 | NaN | NaN | ... | NaN | NaN | NaN | NaN | t | 1 | 0 | 1 | 0 | NaN |
| 41530 | 772710779275911753 | city scrape | Central Park Close By - 24 | This is a Three-Bedroom Apartment. You will ha... | NaN | 2653479 | Richard | 2012-06-16 | New York, NY | I love to travel and meet people. | ... | NaN | NaN | NaN | NaN | t | 37 | 2 | 35 | 0 | NaN |
| 41531 | 772714221060214808 | city scrape | Good Vibes at The Bronx | Keep it simple at this peaceful and centrally-... | NaN | 421264574 | Aridio | 2021-09-02 | NaN | NaN | ... | NaN | NaN | NaN | NaN | t | 1 | 1 | 0 | 0 | NaN |
| 41532 | 772716724205003579 | city scrape | 2 bedroom Condo near West Village | This beautifully decorated condo will give you... | NaN | 481177884 | Steven | 2022-09-26 | NaN | NaN | ... | NaN | NaN | NaN | NaN | t | 2 | 2 | 0 | 0 | NaN |
41533 rows × 57 columns
df_airbnb.columns
Index(['id', 'source', 'name', 'description', 'neighborhood_overview',
'host_id', 'host_name', 'host_since', 'host_location', 'host_about',
'host_response_time', 'host_response_rate', 'host_acceptance_rate',
'host_is_superhost', 'host_neighbourhood', 'host_listings_count',
'neighbourhood', 'neighbourhood_cleansed',
'neighbourhood_group_cleansed', 'latitude', 'longitude',
'property_type', 'room_type', 'accommodates', 'bathrooms_text',
'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights',
'maximum_nights', 'calendar_updated', 'has_availability',
'availability_30', 'availability_60', 'availability_90',
'availability_365', 'calendar_last_scraped', 'number_of_reviews',
'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
'last_review', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin',
'review_scores_communication', 'review_scores_location',
'review_scores_value', 'license', 'instant_bookable',
'calculated_host_listings_count',
'calculated_host_listings_count_entire_homes',
'calculated_host_listings_count_private_rooms',
'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
dtype='object')
missing_values = df_airbnb.isnull().mean()*100
missing_values = missing_values.sort_values(ascending = False)
print(missing_values.to_markdown())
| | 0 | |:---------------------------------------------|------------:| | calendar_updated | 100 | | license | 99.9976 | | host_about | 44.0902 | | neighborhood_overview | 42.0003 | | neighbourhood | 41.9979 | | host_response_time | 32.8534 | | host_response_rate | 32.8534 | | host_acceptance_rate | 29.4007 | | review_scores_location | 23.7113 | | review_scores_value | 23.7113 | | review_scores_checkin | 23.704 | | review_scores_accuracy | 23.6944 | | review_scores_communication | 23.6824 | | review_scores_cleanliness | 23.6703 | | first_review | 22.6158 | | review_scores_rating | 22.6158 | | last_review | 22.6158 | | reviews_per_month | 22.6158 | | host_neighbourhood | 19.7169 | | host_location | 18.6478 | | bedrooms | 9.20232 | | beds | 2.26567 | | description | 1.89247 | | bathrooms_text | 0.185395 | | host_is_superhost | 0.069824 | | name | 0.0313004 | | host_since | 0.0120386 | | host_name | 0.0120386 | | host_listings_count | 0.0120386 | | number_of_reviews_l30d | 0 | | calculated_host_listings_count_entire_homes | 0 | | instant_bookable | 0 | | calculated_host_listings_count | 0 | | number_of_reviews | 0 | | calculated_host_listings_count_private_rooms | 0 | | calculated_host_listings_count_shared_rooms | 0 | | number_of_reviews_ltm | 0 | | id | 0 | | calendar_last_scraped | 0 | | accommodates | 0 | | host_id | 0 | | neighbourhood_cleansed | 0 | | neighbourhood_group_cleansed | 0 | | latitude | 0 | | longitude | 0 | | property_type | 0 | | room_type | 0 | | amenities | 0 | | availability_365 | 0 | | source | 0 | | minimum_nights | 0 | | maximum_nights | 0 | | has_availability | 0 | | availability_30 | 0 | | availability_60 | 0 | | availability_90 | 0 | | price | 0 |
drop_cols = ['calendar_updated', 'license']
df_airbnb = df_airbnb.drop(drop_cols, axis=1)
df_airbnb.shape
(41533, 55)
missing_values = df_airbnb.isnull().mean()*100
missing_values = missing_values.sort_values(ascending = False)
print(missing_values.to_markdown())
| | 0 | |:---------------------------------------------|-----------:| | host_about | 44.0902 | | neighborhood_overview | 42.0003 | | neighbourhood | 41.9979 | | host_response_time | 32.8534 | | host_response_rate | 32.8534 | | host_acceptance_rate | 29.4007 | | review_scores_location | 23.7113 | | review_scores_value | 23.7113 | | review_scores_checkin | 23.704 | | review_scores_accuracy | 23.6944 | | review_scores_communication | 23.6824 | | review_scores_cleanliness | 23.6703 | | review_scores_rating | 22.6158 | | last_review | 22.6158 | | first_review | 22.6158 | | reviews_per_month | 22.6158 | | host_neighbourhood | 19.7169 | | host_location | 18.6478 | | bedrooms | 9.20232 | | beds | 2.26567 | | description | 1.89247 | | bathrooms_text | 0.185395 | | host_is_superhost | 0.069824 | | name | 0.0313004 | | host_listings_count | 0.0120386 | | host_since | 0.0120386 | | host_name | 0.0120386 | | number_of_reviews_l30d | 0 | | number_of_reviews_ltm | 0 | | calculated_host_listings_count_entire_homes | 0 | | instant_bookable | 0 | | calculated_host_listings_count | 0 | | calendar_last_scraped | 0 | | calculated_host_listings_count_private_rooms | 0 | | calculated_host_listings_count_shared_rooms | 0 | | number_of_reviews | 0 | | id | 0 | | availability_365 | 0 | | availability_90 | 0 | | host_id | 0 | | neighbourhood_cleansed | 0 | | neighbourhood_group_cleansed | 0 | | latitude | 0 | | longitude | 0 | | property_type | 0 | | room_type | 0 | | accommodates | 0 | | source | 0 | | price | 0 | | minimum_nights | 0 | | maximum_nights | 0 | | has_availability | 0 | | availability_30 | 0 | | availability_60 | 0 | | amenities | 0 |
df_airbnb.shape
(41533, 55)
df_airbnb[['host_location', 'host_neighbourhood', 'neighbourhood', 'neighbourhood_cleansed',
'neighbourhood_group_cleansed']].head(10)
| host_location | host_neighbourhood | neighbourhood | neighbourhood_cleansed | neighbourhood_group_cleansed | |
|---|---|---|---|---|---|
| 0 | New York, NY | Midtown | New York, United States | Midtown | Manhattan |
| 1 | New York, NY | Upper West Side | New York, United States | Upper West Side | Manhattan |
| 2 | New York, NY | Greenwood Heights | NaN | Sunset Park | Brooklyn |
| 3 | New York, NY | Bedford-Stuyvesant | NaN | Bedford-Stuyvesant | Brooklyn |
| 4 | New York, NY | Williamsburg | NaN | Williamsburg | Brooklyn |
| 5 | New York, NY | Hell's Kitchen | New York, United States | Midtown | Manhattan |
| 6 | New York, NY | East Harlem | New York, United States | East Harlem | Manhattan |
| 7 | New York, NY | East Harlem | New York, United States | East Harlem | Manhattan |
| 8 | New York, NY | Fort Greene | Brooklyn, New York, United States | Fort Greene | Brooklyn |
| 9 | New York, NY | Williamsburg | Brooklyn, New York, United States | Williamsburg | Brooklyn |
drop_cols = ['host_neighbourhood']
df_airbnb = df_airbnb.drop(drop_cols, axis=1)
df_airbnb.shape
(41533, 54)
df_airbnb[['calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count']]
| calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | calculated_host_listings_count_entire_homes | calculated_host_listings_count | |
|---|---|---|---|---|
| 0 | 0 | 0 | 3 | 3 |
| 1 | 1 | 0 | 0 | 1 |
| 2 | 0 | 0 | 1 | 1 |
| 3 | 2 | 0 | 0 | 2 |
| 4 | 0 | 0 | 1 | 1 |
| ... | ... | ... | ... | ... |
| 41528 | 0 | 0 | 1 | 1 |
| 41529 | 1 | 0 | 0 | 1 |
| 41530 | 35 | 0 | 2 | 37 |
| 41531 | 0 | 0 | 1 | 1 |
| 41532 | 0 | 0 | 2 | 2 |
41533 rows × 4 columns
df_airbnb.calculated_host_listings_count_shared_rooms.value_counts()
0 40706 1 464 2 112 3 111 15 32 8 24 5 24 4 24 10 16 9 13 7 7 Name: calculated_host_listings_count_shared_rooms, dtype: int64
calculated_host_listings_count_shared_rooms seems to have a lot zeroes which in case means the host never listed a shared room.
We will keep that for now it might help
df_airbnb[['host_response_rate', 'host_acceptance_rate']]
| host_response_rate | host_acceptance_rate | |
|---|---|---|
| 0 | 72% | 22% |
| 1 | NaN | NaN |
| 2 | NaN | 50% |
| 3 | 90% | 82% |
| 4 | 100% | 100% |
| ... | ... | ... |
| 41528 | 100% | 100% |
| 41529 | 100% | 100% |
| 41530 | 98% | 86% |
| 41531 | NaN | NaN |
| 41532 | 83% | 100% |
41533 rows × 2 columns
df_airbnb[['price']]
| price | |
|---|---|
| 0 | $175.00 |
| 1 | $75.00 |
| 2 | $275.00 |
| 3 | $60.00 |
| 4 | $68.00 |
| ... | ... |
| 41528 | $105.00 |
| 41529 | $87.00 |
| 41530 | $70.00 |
| 41531 | $125.00 |
| 41532 | $1,114.00 |
41533 rows × 1 columns
'first_review', 'last_review', 'host_since'
df_airbnb['first_review'] = pd.to_datetime(df_airbnb['first_review'])
df_airbnb['last_review'] = pd.to_datetime(df_airbnb['last_review'])
df_airbnb['host_since'] = pd.to_datetime(df_airbnb['host_since'])
df_airbnb[['first_review', 'last_review', 'host_since']]
| first_review | last_review | host_since | |
|---|---|---|---|
| 0 | 2009-11-21 | 2022-06-21 | 2008-09-09 |
| 1 | 2009-09-07 | 2017-07-21 | 2009-02-05 |
| 2 | 2014-01-02 | 2022-08-10 | 2009-02-03 |
| 3 | 2009-05-28 | 2019-12-02 | 2009-02-03 |
| 4 | 2009-05-25 | 2022-11-02 | 2009-05-06 |
| ... | ... | ... | ... |
| 41528 | NaT | NaT | 2017-04-12 |
| 41529 | NaT | NaT | 2020-02-24 |
| 41530 | NaT | NaT | 2012-06-16 |
| 41531 | NaT | NaT | 2021-09-02 |
| 41532 | NaT | NaT | 2022-09-26 |
41533 rows × 3 columns
def remove_signs(X):
# list of columns that we need to run strip function on
cols1 = ['host_response_rate', 'host_acceptance_rate']
cols2 = ['price']
# iterating over all the columns in the list
for col in cols1:
X[col] = X[col].str.strip('%')
for col in cols2:
X[col] = X[col].str.strip('$')
for col in cols2:
X[col] = X[col].str.replace(',','')
# list of datetime columns that we will change to weeks for feature engineering in the next step
cols3 = ['first_review', 'last_review', 'host_since']
# iterating over all the columns in the list
for col in cols3:
X[col] = X[col].dt.week
# change dtype of 'price'z to float
for col in cols2:
X[col] = X[col].astype(float)
return X
# passing our dataframe as the argument
df_airbnb = remove_signs(df_airbnb)
<ipython-input-30-8139a14801d6>:17: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead. X[col] = X[col].dt.week <ipython-input-30-8139a14801d6>:17: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead. X[col] = X[col].dt.week <ipython-input-30-8139a14801d6>:17: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead. X[col] = X[col].dt.week
df_airbnb
| id | source | name | description | neighborhood_overview | host_id | host_name | host_since | host_location | host_about | ... | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | city scrape | Skylit Midtown Castle | Beautiful, spacious skylit studio in the heart... | Centrally located in the heart of Manhattan ju... | 2845 | Jennifer | 37.0 | New York, NY | A New Yorker since (Phone number hidden by Air... | ... | 4.77 | 4.80 | 4.81 | 4.40 | f | 3 | 3 | 0 | 0 | 0.31 |
| 1 | 5203 | previous scrape | Cozy Clean Guest Room - Family Apt | Our best guests are seeking a safe, clean, spa... | Our neighborhood is full of restaurants and ca... | 7490 | MaryEllen | 6.0 | New York, NY | Welcome to family life with my oldest two away... | ... | 4.97 | 4.95 | 4.94 | 4.92 | f | 1 | 0 | 1 | 0 | 0.73 |
| 2 | 5136 | city scrape | Spacious Brooklyn Duplex, Patio + Garden | We welcome you to stay in our lovely 2 br dupl... | NaN | 7378 | Rebecca | 6.0 | New York, NY | Rebecca is an artist/designer, and Henoch is i... | ... | 5.00 | 5.00 | 4.67 | 5.00 | f | 1 | 1 | 0 | 0 | 0.03 |
| 3 | 5121 | city scrape | BlissArtsSpace! | One room available for rent in a 2 bedroom apt... | NaN | 7356 | Garon | 6.0 | New York, NY | I am an artist(painter, filmmaker) and curato... | ... | 4.91 | 4.91 | 4.47 | 4.52 | f | 2 | 0 | 2 | 0 | 0.30 |
| 4 | 6848 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | 15991 | Allen & Irina | 19.0 | New York, NY | We love to travel. When we travel we like to s... | ... | 4.84 | 4.80 | 4.67 | 4.56 | f | 1 | 1 | 0 | 0 | 1.13 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41528 | 772683159414917117 | city scrape | Dahiari | Desconecta de tus preocupaciones en este espac... | NaN | 125534010 | Larissa | 15.0 | Dominican Republic | NaN | ... | NaN | NaN | NaN | NaN | f | 1 | 1 | 0 | 0 | NaN |
| 41529 | 772705452516314073 | city scrape | Beautiful Basement | Your family will be close to everything when y... | NaN | 338424773 | Md | 9.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | t | 1 | 0 | 1 | 0 | NaN |
| 41530 | 772710779275911753 | city scrape | Central Park Close By - 24 | This is a Three-Bedroom Apartment. You will ha... | NaN | 2653479 | Richard | 24.0 | New York, NY | I love to travel and meet people. | ... | NaN | NaN | NaN | NaN | t | 37 | 2 | 35 | 0 | NaN |
| 41531 | 772714221060214808 | city scrape | Good Vibes at The Bronx | Keep it simple at this peaceful and centrally-... | NaN | 421264574 | Aridio | 35.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | t | 1 | 1 | 0 | 0 | NaN |
| 41532 | 772716724205003579 | city scrape | 2 bedroom Condo near West Village | This beautifully decorated condo will give you... | NaN | 481177884 | Steven | 39.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | t | 2 | 2 | 0 | 0 | NaN |
41533 rows × 54 columns
df_airbnb[['host_response_rate', 'host_acceptance_rate']]
| host_response_rate | host_acceptance_rate | |
|---|---|---|
| 0 | 72 | 22 |
| 1 | NaN | NaN |
| 2 | NaN | 50 |
| 3 | 90 | 82 |
| 4 | 100 | 100 |
| ... | ... | ... |
| 41528 | 100 | 100 |
| 41529 | 100 | 100 |
| 41530 | 98 | 86 |
| 41531 | NaN | NaN |
| 41532 | 83 | 100 |
41533 rows × 2 columns
df_airbnb[['price']]
| price | |
|---|---|
| 0 | 175.0 |
| 1 | 75.0 |
| 2 | 275.0 |
| 3 | 60.0 |
| 4 | 68.0 |
| ... | ... |
| 41528 | 105.0 |
| 41529 | 87.0 |
| 41530 | 70.0 |
| 41531 | 125.0 |
| 41532 | 1114.0 |
41533 rows × 1 columns
# Checking null values count columnwise
total_nan = df_airbnb.isna().sum().sort_values(ascending=False)
percentage_nan = (total_nan / df_airbnb.shape[0]) * 100
tabel = pd.concat([total_nan, percentage_nan], axis=1, keys=['Total NaN values', 'Percentage of NaN values'])
tabel
| Total NaN values | Percentage of NaN values | |
|---|---|---|
| host_about | 18312 | 44.090241 |
| neighborhood_overview | 17444 | 42.000337 |
| neighbourhood | 17443 | 41.997929 |
| host_response_time | 13645 | 32.853394 |
| host_response_rate | 13645 | 32.853394 |
| host_acceptance_rate | 12211 | 29.400718 |
| review_scores_location | 9848 | 23.711266 |
| review_scores_value | 9848 | 23.711266 |
| review_scores_checkin | 9845 | 23.704043 |
| review_scores_accuracy | 9841 | 23.694412 |
| review_scores_communication | 9836 | 23.682373 |
| review_scores_cleanliness | 9831 | 23.670334 |
| review_scores_rating | 9393 | 22.615751 |
| last_review | 9393 | 22.615751 |
| first_review | 9393 | 22.615751 |
| reviews_per_month | 9393 | 22.615751 |
| host_location | 7745 | 18.647822 |
| bedrooms | 3822 | 9.202321 |
| beds | 941 | 2.265668 |
| description | 786 | 1.892471 |
| bathrooms_text | 77 | 0.185395 |
| host_is_superhost | 29 | 0.069824 |
| name | 13 | 0.031300 |
| host_since | 5 | 0.012039 |
| host_name | 5 | 0.012039 |
| host_listings_count | 5 | 0.012039 |
| number_of_reviews_l30d | 0 | 0.000000 |
| number_of_reviews_ltm | 0 | 0.000000 |
| calculated_host_listings_count_entire_homes | 0 | 0.000000 |
| instant_bookable | 0 | 0.000000 |
| calculated_host_listings_count | 0 | 0.000000 |
| calendar_last_scraped | 0 | 0.000000 |
| calculated_host_listings_count_private_rooms | 0 | 0.000000 |
| calculated_host_listings_count_shared_rooms | 0 | 0.000000 |
| number_of_reviews | 0 | 0.000000 |
| id | 0 | 0.000000 |
| availability_365 | 0 | 0.000000 |
| availability_90 | 0 | 0.000000 |
| host_id | 0 | 0.000000 |
| neighbourhood_cleansed | 0 | 0.000000 |
| neighbourhood_group_cleansed | 0 | 0.000000 |
| latitude | 0 | 0.000000 |
| longitude | 0 | 0.000000 |
| property_type | 0 | 0.000000 |
| room_type | 0 | 0.000000 |
| accommodates | 0 | 0.000000 |
| amenities | 0 | 0.000000 |
| source | 0 | 0.000000 |
| minimum_nights | 0 | 0.000000 |
| maximum_nights | 0 | 0.000000 |
| has_availability | 0 | 0.000000 |
| availability_30 | 0 | 0.000000 |
| availability_60 | 0 | 0.000000 |
| price | 0 | 0.000000 |
df_airbnb['property_type'].value_counts()
Entire rental unit 17579
Private room in rental unit 10995
Private room in home 2198
Entire condo 1690
Entire home 1568
...
Private room in religious building 1
Private room in tent 1
Private room in dorm 1
Private room in farm stay 1
Shared room in shepherd's hut 1
Name: property_type, Length: 80, dtype: int64
# For property_type, let's consider top 15 types and put rest under Others.
# List of top 15 values
top15 = df_airbnb['property_type'].value_counts()[:15].index
# if not in top 15 put it under 'Other'
df_airbnb.loc[~df_airbnb['property_type'].isin(top15), 'property_type'] = 'Other'
df_airbnb['property_type'].value_counts()
Entire rental unit 17579 Private room in rental unit 10995 Private room in home 2198 Entire condo 1690 Entire home 1568 Other 1400 Private room in townhouse 1098 Room in hotel 936 Entire loft 760 Entire townhouse 684 Private room in condo 610 Entire serviced apartment 462 Room in boutique hotel 438 Entire guest suite 383 Shared room in rental unit 382 Private room in serviced apartment 350 Name: property_type, dtype: int64
df_airbnb['neighbourhood'].value_counts()
New York, United States 9649
Brooklyn, New York, United States 8997
Queens, New York, United States 3490
Bronx, New York, United States 491
The Bronx, New York, United States 392
...
ozone park queens , New York, United States 1
Jamaica , ny, United States 1
New York, New York , United States 1
Crown Heights,NY, New York, United States 1
Valley Stream, New York, United States 1
Name: neighbourhood, Length: 193, dtype: int64
df_airbnb['neighbourhood_cleansed'].value_counts()
Bedford-Stuyvesant 2936
Williamsburg 2570
Harlem 1949
Midtown 1918
Bushwick 1752
...
Woodrow 1
Bull's Head 1
Westerleigh 1
New Dorp 1
Hollis Hills 1
Name: neighbourhood_cleansed, Length: 223, dtype: int64
Finding skewness of the following attributes.
Data Visualization
from pandas.plotting import scatter_matrix
# Trying to figure out different columns that seems to affect our target variable 'Price'
cols1 = ['price','bathrooms_text', 'bedrooms',
'beds', 'accommodates', 'reviews_per_month']
plt.figure(dpi=500, facecolor = '#dadada')
scatter_matrix(df_airbnb[cols1], alpha=0.4, figsize=(21,17))
plt.savefig(r"figure_1.png")
plt.show()
<Figure size 3000x2000 with 0 Axes>
cols = ['price','longitude', 'latitude', 'bedrooms',
'reviews_per_month', 'neighbourhood']
scatter_matrix(df_airbnb[cols], alpha=0.4, figsize=(21,17))
plt.savefig(r"figure_2.png")
plt.show()
cols = ['longitude', 'latitude', 'price', 'bedrooms', 'beds', 'accommodates']
def finding_skewness():
for col in cols:
print(f'{col} has a skewness of {df_airbnb[col].skew(skipna = True)}')
finding_skewness()
longitude has a skewness of 1.1936961560151862 latitude has a skewness of 0.21193776627988625 price has a skewness of 78.22503185691421 bedrooms has a skewness of 2.6434155698521207 beds has a skewness of 3.971568711806406 accommodates has a skewness of 2.589019820699697
# to set the facecolor
plt.figure(dpi=250, facecolor = '#dadada')
sns.boxplot('price', data=df_airbnb, palette='Blues')
# plt.ylim(0,5000)# Remove the splines
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.savefig(r"box2.png")
plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-44-e20d2be84cd7> in <module> 2 plt.figure(dpi=250, facecolor = '#dadada') 3 ----> 4 sns.boxplot('price', data=df_airbnb, palette='Blues') 5 # plt.ylim(0,5000)# Remove the splines 6 plt.gca().spines["top"].set_visible(False) TypeError: boxplot() got multiple values for argument 'data'
<Figure size 1500x1000 with 0 Axes>
plt.figure(dpi=250, facecolor = '#dadada')
# by limitting the x axis we are no able to see the box
sns.boxplot('price', data=df_airbnb, palette='Blues')
plt.xlim(0,1000)
# Remove the splines
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.savefig(r"box1.png")
plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-45-339292bdc3a9> in <module> 1 plt.figure(dpi=250, facecolor = '#dadada') 2 # by limitting the x axis we are no able to see the box ----> 3 sns.boxplot('price', data=df_airbnb, palette='Blues') 4 plt.xlim(0,1000) 5 TypeError: boxplot() got multiple values for argument 'data'
<Figure size 1500x1000 with 0 Axes>
It is very hard ot believe that airbnb prices can be as high as 10000 hence we will be using only prices that are between 20$-1000$
# to get rid of the outliers and to bring down the skewness we will only use price below 1000 and above 24
df_airbnb = df_airbnb.loc[(df_airbnb.price < 1000) & (df_airbnb.price > 24)]
# checking how much did we control the skewness on price
df_airbnb.price.skew()
2.404036150322895
We were able to bring down the skewness of our target variable down to 2.4 from 13.4 and the rest we will take care by applying log later.
def plotting_to_check_skewness():
for col in ['price']:
# to set the facecolor
plt.figure(dpi=500, facecolor = '#dadada')
# setting the limit on the x axis to be able to visualize as we have a big outliers
plt.xlim(0, 700)
sns.distplot(df_airbnb[col], kde=True, bins='auto')
# Remove the splines
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.tight_layout() # Makes it better looking specially on laptops
# to save the fig
plt.savefig('skew.png',bbox_inches='tight', dpi=500, facecolor = '#dadada')
plt.show()
plotting_to_check_skewness()
<ipython-input-48-452f7c12176b>:8: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df_airbnb[col], kde=True, bins='auto')
# Resetting the index as we deleted some rows
df_airbnb.reset_index(drop=True, inplace=True)
airbnb_corr = df_airbnb.corr()
airbnb_corr.price.sort_values(ascending=False)
price 1.000000 accommodates 0.464287 bedrooms 0.388907 beds 0.360864 host_listings_count 0.163592 calculated_host_listings_count_entire_homes 0.141445 availability_90 0.139171 availability_60 0.138653 availability_30 0.132365 availability_365 0.124052 review_scores_location 0.115962 id 0.105374 review_scores_cleanliness 0.095617 reviews_per_month 0.087290 host_id 0.084607 last_review 0.075268 calculated_host_listings_count 0.069573 number_of_reviews_ltm 0.058225 review_scores_rating 0.056165 latitude 0.040055 number_of_reviews_l30d 0.036037 review_scores_accuracy 0.016967 host_since 0.013988 first_review 0.013575 review_scores_communication 0.009955 review_scores_checkin 0.003006 maximum_nights -0.001487 review_scores_value -0.004297 number_of_reviews -0.011327 calculated_host_listings_count_shared_rooms -0.048729 calculated_host_listings_count_private_rooms -0.083295 minimum_nights -0.114291 longitude -0.239916 Name: price, dtype: float64
# set heatmap size
plt.figure(figsize= (22,13), dpi=250)
# create heatmap using seaborn
cbar_kws = {"shrink":.8,
'extend':'max',
'extendfrac':.2,
"drawedges":True}
sns.heatmap(airbnb_corr, vmin = -1, vmax = 1, cmap="coolwarm", annot = True, annot_kws={'size': 10}, linewidth = 1, cbar_kws=cbar_kws)
plt.savefig(r'heat.png',bbox_inches='tight', dpi=250, facecolor = '#dadada')
plt.show()
Looking at the heatmap we can eliminate features that highly correlated between each other as they will not add anymore value in the model.
drop_cols4 = ['number_of_reviews_ltm', 'availability_60', 'availability_90', 'calculated_host_listings_count_entire_homes']
df_airbnb.drop(drop_cols4, axis=1, inplace=True)
<ipython-input-52-98f1c63ea659>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb.drop(drop_cols4, axis=1, inplace=True)
plt.figure(dpi=250, facecolor = '#dadada')
# histogram of the bedrooms attribute
plt.hist(df_airbnb['bedrooms'])
plt.xlim(0,8)
# Remove the splines
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.title('Bedrooms')
plt.savefig(r"box5.png")
plt.show()
# Filling the NaN values using front fill method
df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill')
# Made a new column with 4 bins each with good amount of instances
df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'],
bins=[0., 1, 2, np.inf],
labels=[1, 2, 3])
df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
<ipython-input-54-368bcc9ff685>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill') <ipython-input-54-368bcc9ff685>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'], <ipython-input-54-368bcc9ff685>:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
# histogram plot of our newly created column
plt.figure(dpi=250, facecolor = '#dadada')
# histogram of the bedrooms attribute
plt.hist(df_airbnb['bedroom'])
plt.xlim(0,4)
# Remove the splines
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.savefig(r"box7.png")
plt.title('Bedrooms')
plt.show()
# Filling the NaN values using front fill method
df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill')
# Made a new column with 4 bins each with good amount of instances
df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'],
bins=[0., 1, 2, np.inf],
labels=[1, 2, 3])
df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
<ipython-input-56-368bcc9ff685>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb['bedrooms'] = df_airbnb['bedrooms'].fillna(method='ffill') <ipython-input-56-368bcc9ff685>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb['bedroom'] = pd.cut(df_airbnb['bedrooms'], <ipython-input-56-368bcc9ff685>:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb['bedroom'] = df_airbnb['bedroom'].fillna(method='ffill')
df_airbnb.drop(0,axis=0,inplace=True)
<ipython-input-57-6d7551621bfc>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_airbnb.drop(0,axis=0,inplace=True)
df_airbnb['bedrooms'].isna().sum()
0
# histogram plot of our newly created column
plt.figure(dpi=250, facecolor = '#dadada')
# histogram of the bedrooms attribute
plt.hist(df_airbnb['bedroom'])
plt.xlim(0,4)
# Remove the splines
plt.gca().spines["top"].set_visible(False)
plt.gca().spines["bottom"].set_visible(False)
plt.gca().spines["right"].set_visible(False)
plt.gca().spines["left"].set_visible(False)
plt.savefig(r"box7.png")
plt.title('Bedrooms')
plt.show()
# Replacing columns with f/t with 0/1
df_airbnb.replace({'f': 0, 't': 1}, inplace=True)
# Plotting the distribution of numerical and boolean categories
df_airbnb.hist(figsize=(20,20));
<ipython-input-60-4ab2edf398b7>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_airbnb.replace({'f': 0, 't': 1}, inplace=True)
import warnings
warnings.filterwarnings("ignore")
df_airbnb.last_review = pd.to_datetime(df_airbnb.last_review) # Converting to datetime
# Calculating the number of days between the first review and the date the data was scraped
df_airbnb['time_since_first_review'] = (pd.datetime(2019, 4, 9) - df_airbnb.last_review).astype('timedelta64[D]')
# Distribution of the number of days since first review
df_airbnb.first_review.hist(figsize=(15,5), bins=30);
def bin_column(col, bins, labels, na_label='unknown'):
"""
Takes in a column name, bin cut points and labels, replaces the original column with a
binned version, and replaces nulls (with 'unknown' if unspecified).
"""
df_airbnb[col] = pd.cut(df[col], bins=bins, labels=labels, include_lowest=True)
df_airbnb[col] = df_airbnb[col].astype('str')
df_airbnb[col].fillna(na_label, inplace=True)
# Checking the distributions of the review ratings columns
variables_to_plot = list(df_airbnb.columns[df_airbnb.columns.str.startswith("review_scores") == True])
fig = plt.figure(figsize=(12,8))
for i, var_name in enumerate(variables_to_plot):
ax = fig.add_subplot(3,3,i+1)
df[var_name].hist(bins=10,ax=ax)
ax.set_title(var_name)
fig.tight_layout()
plt.show()
df_airbnb['host_since'] = pd.to_datetime(df['host_since'])
df_airbnb['first_review'] = pd.to_datetime(df['first_review'])
print("Average number of listings per host per year on Airbnb in New York:")
print(round(df_airbnb.set_index('host_since').host_listings_count.resample('YS').mean(),2))
Average number of listings per host per year on Airbnb in New York: host_since 2008-01-01 1.28 2009-01-01 7.74 2010-01-01 28.84 2011-01-01 26.64 2012-01-01 50.76 2013-01-01 54.81 2014-01-01 46.98 2015-01-01 61.10 2016-01-01 77.26 2017-01-01 88.60 2018-01-01 138.41 2019-01-01 81.05 2020-01-01 152.84 2021-01-01 115.33 2022-01-01 125.43 Name: host_listings_count, dtype: float64
# List of the largest host_listings_count and the year the host joined Airbnb
df.sort_values('host_listings_count').drop_duplicates('host_listings_count',keep='last').tail(10)[['host_since', 'host_listings_count']]
| host_since | host_listings_count | |
|---|---|---|
| 10678 | 2015-04-14 | 564.0 |
| 38437 | 2014-12-23 | 574.0 |
| 25495 | 2020-10-26 | 715.0 |
| 38587 | 2022-07-11 | 767.0 |
| 31621 | 2019-04-26 | 1459.0 |
| 20661 | 2015-11-02 | 1519.0 |
| 33444 | 2018-02-22 | 2250.0 |
| 28370 | 2013-03-25 | 2648.0 |
| 15845 | 2016-12-16 | 4559.0 |
| 4953 | NaN | NaN |
df_airbnb.head(10)
| id | source | name | description | neighborhood_overview | host_id | host_name | host_since | host_location | host_about | ... | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | bedroom | time_since_first_review | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 5203 | previous scrape | Cozy Clean Guest Room - Family Apt | Our best guests are seeking a safe, clean, spa... | Our neighborhood is full of restaurants and ca... | 7490 | MaryEllen | 2009-02-05 | New York, NY | Welcome to family life with my oldest two away... | ... | 4.95 | 4.94 | 4.92 | 0 | 1 | 1 | 0 | 0.73 | 1 | 17994.0 |
| 2 | 5136 | city scrape | Spacious Brooklyn Duplex, Patio + Garden | We welcome you to stay in our lovely 2 br dupl... | NaN | 7378 | Rebecca | 2009-02-03 | New York, NY | Rebecca is an artist/designer, and Henoch is i... | ... | 5.00 | 4.67 | 5.00 | 0 | 1 | 0 | 0 | 0.03 | 2 | 17994.0 |
| 3 | 5121 | city scrape | BlissArtsSpace! | One room available for rent in a 2 bedroom apt... | NaN | 7356 | Garon | 2009-02-03 | New York, NY | I am an artist(painter, filmmaker) and curato... | ... | 4.91 | 4.47 | 4.52 | 0 | 2 | 2 | 0 | 0.30 | 1 | 17994.0 |
| 4 | 6848 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | 15991 | Allen & Irina | 2009-05-06 | New York, NY | We love to travel. When we travel we like to s... | ... | 4.80 | 4.67 | 4.56 | 0 | 1 | 0 | 0 | 1.13 | 1 | 17994.0 |
| 5 | 5178 | city scrape | Large Furnished Room Near B'way | Please don’t expect the luxury here just a bas... | Theater district, many restaurants around here. | 8967 | Shunichi | 2009-03-03 | New York, NY | I used to work for a financial industry but no... | ... | 4.45 | 4.88 | 4.39 | 0 | 1 | 1 | 0 | 3.38 | 1 | 17994.0 |
| 6 | 6990 | city scrape | UES Beautiful Blue Room | Beautiful peaceful healthy home<br /><br /><b>... | Location: Five minutes to Central Park, Museum... | 16800 | Cyn | 2009-05-12 | New York, NY | Capturing the Steinbeck side of life in its Fi... | ... | 4.95 | 4.84 | 4.85 | 1 | 1 | 1 | 0 | 1.52 | 1 | 17994.0 |
| 7 | 6872 | city scrape | Uptown Sanctuary w/ Private Bath (Month to Month) | A charming month-to-month home away from home ... | This sweet Harlem sanctuary is a 10-20 minute ... | 16104 | Kae | 2009-05-07 | New York, NY | A former life in fashion and wellness has left... | ... | 5.00 | 5.00 | 5.00 | 0 | 2 | 2 | 0 | 0.16 | 1 | 17994.0 |
| 8 | 7097 | city scrape | Perfect for Your Parents: Privacy + Garden | Parents/grandparents coming to town, or just h... | Residential, village-like atmosphere. Lots of ... | 17571 | Jane | 2009-05-17 | New York, NY | I have been an Airbnb host since 2009 -- just ... | ... | 4.92 | 4.94 | 4.81 | 1 | 2 | 1 | 0 | 2.01 | 1 | 17994.0 |
| 9 | 7064 | city scrape | Amazing location! Wburg. Large, bright & tranquil | Large, private loft-like room in a spacious 2-... | - One stop from the East Village, Lower East S... | 17297 | Joelle | 2009-05-15 | New York, NY | I have lived in the same apartment in Brooklyn... | ... | 5.00 | 5.00 | 5.00 | 0 | 2 | 2 | 0 | 0.09 | 1 | 17994.0 |
| 10 | 8490 | city scrape | Maison des Sirenes1,bohemian, luminous apartment | <b>The space</b><br />I am the lucky owner of ... | NaN | 25183 | Nathalie | 2009-07-10 | New York, NY | I am French and have been living in Ny for 10... | ... | 4.87 | 4.65 | 4.75 | 0 | 2 | 0 | 0 | 1.01 | 1 | 17994.0 |
10 rows × 52 columns
df_missing_2 = df_airbnb.isna().sum()
df_missing_2 = df_missing_2.sort_values(ascending = False)
print(df_missing_2.to_markdown())
| | 0 | |:---------------------------------------------|------:| | host_about | 17919 | | neighbourhood | 17050 | | neighborhood_overview | 17050 | | host_response_time | 13429 | | host_response_rate | 13429 | | host_acceptance_rate | 11996 | | review_scores_location | 9308 | | review_scores_value | 9308 | | review_scores_checkin | 9305 | | review_scores_accuracy | 9301 | | review_scores_communication | 9296 | | review_scores_cleanliness | 9291 | | time_since_first_review | 8859 | | review_scores_rating | 8859 | | last_review | 8859 | | reviews_per_month | 8859 | | first_review | 8481 | | host_location | 7398 | | beds | 886 | | description | 746 | | bathrooms_text | 45 | | name | 12 | | host_listings_count | 5 | | host_since | 5 | | host_name | 5 | | number_of_reviews_l30d | 0 | | calculated_host_listings_count_shared_rooms | 0 | | instant_bookable | 0 | | calculated_host_listings_count | 0 | | calculated_host_listings_count_private_rooms | 0 | | calendar_last_scraped | 0 | | bedroom | 0 | | number_of_reviews | 0 | | id | 0 | | availability_365 | 0 | | availability_30 | 0 | | host_id | 0 | | host_is_superhost | 0 | | neighbourhood_cleansed | 0 | | neighbourhood_group_cleansed | 0 | | latitude | 0 | | longitude | 0 | | property_type | 0 | | room_type | 0 | | accommodates | 0 | | bedrooms | 0 | | source | 0 | | price | 0 | | minimum_nights | 0 | | maximum_nights | 0 | | has_availability | 0 | | amenities | 0 |
df_airbnb
| id | source | name | description | neighborhood_overview | host_id | host_name | host_since | host_location | host_about | ... | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | bedroom | time_since_first_review | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 5203 | previous scrape | Cozy Clean Guest Room - Family Apt | Our best guests are seeking a safe, clean, spa... | Our neighborhood is full of restaurants and ca... | 7490 | MaryEllen | 2009-02-05 | New York, NY | Welcome to family life with my oldest two away... | ... | 4.95 | 4.94 | 4.92 | 0 | 1 | 1 | 0 | 0.73 | 1 | 17994.0 |
| 2 | 5136 | city scrape | Spacious Brooklyn Duplex, Patio + Garden | We welcome you to stay in our lovely 2 br dupl... | NaN | 7378 | Rebecca | 2009-02-03 | New York, NY | Rebecca is an artist/designer, and Henoch is i... | ... | 5.00 | 4.67 | 5.00 | 0 | 1 | 0 | 0 | 0.03 | 2 | 17994.0 |
| 3 | 5121 | city scrape | BlissArtsSpace! | One room available for rent in a 2 bedroom apt... | NaN | 7356 | Garon | 2009-02-03 | New York, NY | I am an artist(painter, filmmaker) and curato... | ... | 4.91 | 4.47 | 4.52 | 0 | 2 | 2 | 0 | 0.30 | 1 | 17994.0 |
| 4 | 6848 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | 15991 | Allen & Irina | 2009-05-06 | New York, NY | We love to travel. When we travel we like to s... | ... | 4.80 | 4.67 | 4.56 | 0 | 1 | 0 | 0 | 1.13 | 1 | 17994.0 |
| 5 | 5178 | city scrape | Large Furnished Room Near B'way | Please don’t expect the luxury here just a bas... | Theater district, many restaurants around here. | 8967 | Shunichi | 2009-03-03 | New York, NY | I used to work for a financial industry but no... | ... | 4.45 | 4.88 | 4.39 | 0 | 1 | 1 | 0 | 3.38 | 1 | 17994.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 40572 | 772624737396721045 | city scrape | Modern Gem 8 Min to JFK Airport | Kick back & relax in this calm, stylish, cozy ... | NaN | 489891925 | Germanie | 2017-11-19 | NaN | NaN | ... | NaN | NaN | NaN | 1 | 1 | 0 | 0 | NaN | 1 | NaN |
| 40573 | 772683159414917117 | city scrape | Dahiari | Desconecta de tus preocupaciones en este espac... | NaN | 125534010 | Larissa | 2020-08-16 | Dominican Republic | NaN | ... | NaN | NaN | NaN | 0 | 1 | 0 | 0 | NaN | 2 | NaN |
| 40574 | 772705452516314073 | city scrape | Beautiful Basement | Your family will be close to everything when y... | NaN | 338424773 | Md | 2017-11-19 | NaN | NaN | ... | NaN | NaN | NaN | 1 | 1 | 1 | 0 | NaN | 1 | NaN |
| 40575 | 772710779275911753 | city scrape | Central Park Close By - 24 | This is a Three-Bedroom Apartment. You will ha... | NaN | 2653479 | Richard | 2021-11-15 | New York, NY | I love to travel and meet people. | ... | NaN | NaN | NaN | 1 | 37 | 35 | 0 | NaN | 1 | NaN |
| 40576 | 772714221060214808 | city scrape | Good Vibes at The Bronx | Keep it simple at this peaceful and centrally-... | NaN | 421264574 | Aridio | 2022-11-07 | NaN | NaN | ... | NaN | NaN | NaN | 1 | 1 | 0 | 0 | NaN | 2 | NaN |
40576 rows × 52 columns
from sklearn.model_selection import StratifiedShuffleSplit
df_airbnb = df_airbnb.reset_index()
# using Stratified Sampling from Scikit Learn
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df_airbnb, df_airbnb["bedroom"]):
# our stratified train and test set
strat_train_set = df_airbnb.loc[train_index]
strat_test_set = df_airbnb.loc[test_index]
# dropping our new column that we just made for the purpose of stratified splitting
for set in (strat_train_set, strat_test_set):
set.drop(['bedroom'], axis=1, inplace=True)
# Feature matrix
X_train = strat_train_set.drop('price', axis=1)
# Target Variable
y_train = strat_train_set['price']
For evaluation metrics we will be using Mean Absolute Error as it is not affected by outliers unlike Mean Squared Error.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print('Baseline MAE:', mean_absolute_error(y_train, [y_train.mean()]*len(y_train)))
Baseline MAE: 101.64469801403045
We will be using the following the in the initial pipeline:
We will be using simple imputer with median for numerical attributes as there are outliers
We will be using simple imputer with 'most_frequent' for categorical attributes
We will be using One Hot encoder for encoding all categorical attributes but 'host_response_rate', 'host_acceptance_rate'
we will be using Ordinal encoding for 'host_response_rate', 'host_acceptance_rate' attributes
we will be using standard scaler as linear models and Support Vector Machines work better with scaled values
We will be splitting pipelines into numerical and categorical since we will be using different strategy as mentioned above for each
# lets look at numerical attributes for simple imputer with median
num_attribs = X_train.select_dtypes(exclude='object')
num_attribs.columns
Index(['index', 'id', 'host_id', 'host_since', 'host_is_superhost',
'host_listings_count', 'latitude', 'longitude', 'accommodates',
'bedrooms', 'beds', 'minimum_nights', 'maximum_nights',
'has_availability', 'availability_30', 'availability_365',
'number_of_reviews', 'number_of_reviews_l30d', 'first_review',
'last_review', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin',
'review_scores_communication', 'review_scores_location',
'review_scores_value', 'instant_bookable',
'calculated_host_listings_count',
'calculated_host_listings_count_private_rooms',
'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
'time_since_first_review'],
dtype='object')
# lets look at categorical attributes for simple imputer with 'most_frequent'
cat_attribs = X_train.select_dtypes(include='object')
cat_attribs.columns
Index(['source', 'name', 'description', 'neighborhood_overview', 'host_name',
'host_location', 'host_about', 'host_response_time',
'host_response_rate', 'host_acceptance_rate', 'neighbourhood',
'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
'property_type', 'room_type', 'bathrooms_text', 'amenities',
'calendar_last_scraped'],
dtype='object')
# making a seperate list for cols that we will one hot encode and the other taht we will ordinal encode
cat_attribs_ohe = cat_attribs.drop(['host_response_rate', 'host_acceptance_rate'], axis=1)
cat_attribs_ordinal = cat_attribs[['host_response_rate', 'host_acceptance_rate']]
cat_attribs_ordinal.columns
Index(['host_response_rate', 'host_acceptance_rate'], dtype='object')
cat_attribs_ohe.columns
Index(['source', 'name', 'description', 'neighborhood_overview', 'host_name',
'host_location', 'host_about', 'host_response_time', 'neighbourhood',
'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
'property_type', 'room_type', 'bathrooms_text', 'amenities',
'calendar_last_scraped'],
dtype='object')
Making a Custom Transformer for Imputing categorical NaN, which will output the result as a dataframe; which we will later pass it on to encoder in the pipeline. I have having an issue of simple imputer spitting out an array, which when passed to ohe for encoding the cat features was throwing an error, as it was expecting a dataframe.
pip install category-encoders
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category-encoders
Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.2/81.2 KB 3.9 MB/s eta 0:00:00
Requirement already satisfied: pandas>=1.0.5 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.4.4)
Requirement already satisfied: patsy>=0.5.1 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (0.5.3)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.10.1)
Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (0.13.5)
Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.22.4)
Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.9/dist-packages (from category-encoders) (1.2.2)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=1.0.5->category-encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas>=1.0.5->category-encoders) (2022.7.1)
Requirement already satisfied: six in /usr/local/lib/python3.9/dist-packages (from patsy>=0.5.1->category-encoders) (1.15.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20.0->category-encoders) (1.1.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20.0->category-encoders) (3.1.0)
Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.9/dist-packages (from statsmodels>=0.9.0->category-encoders) (23.0)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.0
# Transformers for the Pipeline
import random
import sklearn
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
class ImputerDF(BaseEstimator, TransformerMixin):
def __init__(self):
self.imputer = SimpleImputer(strategy='most_frequent')
self.cols = []
def fit(self, X, y=None):
self.imputer.fit(X)
self.cols = list(X.columns)
return self
def transform(self, X):
X_t = self.imputer.transform(X)
return pd.DataFrame(X_t, columns=self.cols)
# making sure it works fine and outputs a dataframe before we apply it in the pipeline.
idf = ImputerDF()
idf.fit_transform(X_train.iloc[:5])
| index | id | source | name | description | neighborhood_overview | host_id | host_name | host_since | host_location | ... | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | time_since_first_review | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8817 | 14472671 | city scrape | Unique and Cozy 1 bedroom Apt in quiet Pelham Bay | Quiet and modern one bedroom apt. with fully e... | You can find peace and quiet in this neighborh... | 89188080 | Honey | 2013-11-18 | New York, United States | ... | 4.97 | 4.93 | 4.79 | 4.93 | 0 | 1 | 0 | 0 | 3.09 | 17994.0 |
| 1 | 24484 | 48316162 | city scrape | Sonder Flatiron | Accessible Queen Room | Start your New York City experience from the h... | Centered by one of New York City's most iconic... | 219517861 | Sonder (NYC) | 2019-08-11 | New York, NY | ... | 4.3 | 4.7 | 4.6 | 4.1 | 0 | 76 | 42 | 0 | 0.54 | 17994.0 |
| 2 | 36783 | 710074059868704221 | city scrape | Sunny & Bright Private Room in Flatbush, Brooklyn | Welcome to our home!<br />Take a break and unw... | Located in a serene and treelined section of V... | 478199990 | Diana | 2021-04-12 | New York, NY | ... | 5.0 | 4.86 | 4.86 | 5.0 | 1 | 3 | 3 | 0 | 2.96 | 17994.0 |
| 3 | 22727 | 45672204 | previous scrape | **FULLY FURNISHED 1 BEDROOM APARTMENT NYC** | Fully furnished 1 bedroom apartment in the hea... | Centered by one of New York City's most iconic... | 198716390 | Vlad | 2019-08-11 | New York, NY | ... | 4.0 | 5.0 | 4.33 | 4.67 | 0 | 1 | 0 | 0 | 0.12 | 17994.0 |
| 4 | 7703 | 12656785 | previous scrape | Loft @ Williamsburg Bedford | Typical Brooklyn industrial Loft.<br />Apartme... | Right in the heart of real Williamsburg! Pictu... | 19912320 | Charles | 2016-04-13 | New York, NY | ... | 4.67 | 5.0 | 5.0 | 4.67 | 0 | 2 | 2 | 0 | 0.08 | 17994.0 |
5 rows × 51 columns
# Using median as the strategy for Simple Imputer to predict NaN values considering the ouliers
num_pipeline = make_pipeline(
SimpleImputer(strategy='median'),
StandardScaler()
)
# Added the Custom Transformer to pass on a dataframe to Ordinal Encoder
# Ordinal Encoder from cetgorical encoders library doesn't work well with np.array
cat_pipeline = make_pipeline(
ImputerDF(),
OrdinalEncoder(cols = cat_attribs)
)
Using Column transformer we will be putting two pipelines together
# generating a list of categorical and numerical columns to pass it in the column transformer
cat_attributes = list(cat_attribs)
num_attributes = list(num_attribs)
pipeline = ColumnTransformer([
('num_pipeline', num_pipeline, num_attributes),
('cat_attribs', cat_pipeline, cat_attributes)
])
# we will fit and transform on X_train
X_train_transformed = pipeline.fit_transform(X_train)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-89-fb8a487a6939> in <module> 9 10 # we will fit and transform on X_train ---> 11 X_train_transformed = pipeline.fit_transform(X_train) /usr/local/lib/python3.9/dist-packages/sklearn/utils/_set_output.py in wrapped(self, X, *args, **kwargs) 138 @wraps(f) 139 def wrapped(self, X, *args, **kwargs): --> 140 data_to_wrap = f(self, X, *args, **kwargs) 141 if isinstance(data_to_wrap, tuple): 142 # only wrap the first output for cross decomposition /usr/local/lib/python3.9/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y) 725 self._validate_remainder(X) 726 --> 727 result = self._fit_transform(X, y, _fit_transform_one) 728 729 if not result: /usr/local/lib/python3.9/dist-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted, column_as_strings) 656 ) 657 try: --> 658 return Parallel(n_jobs=self.n_jobs)( 659 delayed(func)( 660 transformer=clone(trans) if not fitted else trans, /usr/local/lib/python3.9/dist-packages/sklearn/utils/parallel.py in __call__(self, iterable) 61 for delayed_func, args, kwargs in iterable 62 ) ---> 63 return super().__call__(iterable_with_config) 64 65 /usr/local/lib/python3.9/dist-packages/joblib/parallel.py in __call__(self, iterable) 1046 # remaining jobs. 1047 self._iterating = False -> 1048 if self.dispatch_one_batch(iterator): 1049 self._iterating = self._original_iterator is not None 1050 /usr/local/lib/python3.9/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 862 return False 863 else: --> 864 self._dispatch(tasks) 865 return True 866 /usr/local/lib/python3.9/dist-packages/joblib/parallel.py in _dispatch(self, batch) 780 with self._lock: 781 job_idx = len(self._jobs) --> 782 job = self._backend.apply_async(batch, callback=cb) 783 # A job can complete so quickly than its callback is 784 # called before we get here, causing self._jobs to /usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 206 def apply_async(self, func, callback=None): 207 """Schedule a func to be run""" --> 208 result = ImmediateResult(func) 209 if callback: 210 callback(result) /usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py in __init__(self, batch) 570 # Don't delay the application, to avoid keeping the input 571 # arguments in memory --> 572 self.results = batch() 573 574 def get(self): /usr/local/lib/python3.9/dist-packages/joblib/parallel.py in __call__(self) 261 # change the default number of processes to -1 262 with parallel_backend(self._backend, n_jobs=self._n_jobs): --> 263 return [func(*args, **kwargs) 264 for func, args, kwargs in self.items] 265 /usr/local/lib/python3.9/dist-packages/joblib/parallel.py in <listcomp>(.0) 261 # change the default number of processes to -1 262 with parallel_backend(self._backend, n_jobs=self._n_jobs): --> 263 return [func(*args, **kwargs) 264 for func, args, kwargs in self.items] 265 /usr/local/lib/python3.9/dist-packages/sklearn/utils/parallel.py in __call__(self, *args, **kwargs) 121 config = {} 122 with config_context(**config): --> 123 return self.function(*args, **kwargs) /usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 891 with _print_elapsed_time(message_clsname, message): 892 if hasattr(transformer, "fit_transform"): --> 893 res = transformer.fit_transform(X, y, **fit_params) 894 else: 895 res = transformer.fit(X, y, **fit_params).transform(X) /usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params) 435 """ 436 fit_params_steps = self._check_fit_params(**fit_params) --> 437 Xt = self._fit(X, y, **fit_params_steps) 438 439 last_step = self._final_estimator /usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps) 357 cloned_transformer = clone(transformer) 358 # Fit or load from cache the current transformer --> 359 X, fitted_transformer = fit_transform_one_cached( 360 cloned_transformer, 361 X, /usr/local/lib/python3.9/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs) 347 348 def __call__(self, *args, **kwargs): --> 349 return self.func(*args, **kwargs) 350 351 def call_and_shelve(self, *args, **kwargs): /usr/local/lib/python3.9/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 891 with _print_elapsed_time(message_clsname, message): 892 if hasattr(transformer, "fit_transform"): --> 893 res = transformer.fit_transform(X, y, **fit_params) 894 else: 895 res = transformer.fit(X, y, **fit_params).transform(X) /usr/local/lib/python3.9/dist-packages/sklearn/utils/_set_output.py in wrapped(self, X, *args, **kwargs) 138 @wraps(f) 139 def wrapped(self, X, *args, **kwargs): --> 140 data_to_wrap = f(self, X, *args, **kwargs) 141 if isinstance(data_to_wrap, tuple): 142 # only wrap the first output for cross decomposition /usr/local/lib/python3.9/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params) 876 if y is None: 877 # fit method of arity 1 (unsupervised transformation) --> 878 return self.fit(X, **fit_params).transform(X) 879 else: 880 # fit method of arity 2 (supervised transformation) /usr/local/lib/python3.9/dist-packages/sklearn/impute/_base.py in fit(self, X, y) 388 ) 389 --> 390 X = self._validate_input(X, in_fit=True) 391 392 # default fill_value is 0 for numerical input and "missing_value" /usr/local/lib/python3.9/dist-packages/sklearn/impute/_base.py in _validate_input(self, X, in_fit) 325 326 try: --> 327 X = self._validate_data( 328 X, 329 reset=in_fit, /usr/local/lib/python3.9/dist-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 563 raise ValueError("Validation should be done on X, y or both.") 564 elif not no_val_X and no_val_y: --> 565 X = check_array(X, input_name="X", **check_params) 566 out = X 567 elif no_val_X and not no_val_y: /usr/local/lib/python3.9/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name) 776 ) 777 if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): --> 778 dtype_orig = np.result_type(*dtypes_orig) 779 780 elif hasattr(array, "iloc") and hasattr(array, "dtype"): /usr/local/lib/python3.9/dist-packages/numpy/core/overrides.py in result_type(*args, **kwargs) TypeError: The DType <class 'numpy.dtype[datetime64]'> could not be promoted by <class 'numpy.dtype[float64]'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[int64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>)
df_airbnb.head()
| index | id | source | name | description | neighborhood_overview | host_id | host_name | host_since | host_location | ... | review_scores_communication | review_scores_location | review_scores_value | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | bedroom | time_since_first_review | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 5203 | previous scrape | Cozy Clean Guest Room - Family Apt | Our best guests are seeking a safe, clean, spa... | Our neighborhood is full of restaurants and ca... | 7490 | MaryEllen | 2009-02-05 | New York, NY | ... | 4.95 | 4.94 | 4.92 | 0 | 1 | 1 | 0 | 0.73 | 1 | 17994.0 |
| 1 | 2 | 5136 | city scrape | Spacious Brooklyn Duplex, Patio + Garden | We welcome you to stay in our lovely 2 br dupl... | NaN | 7378 | Rebecca | 2009-02-03 | New York, NY | ... | 5.00 | 4.67 | 5.00 | 0 | 1 | 0 | 0 | 0.03 | 2 | 17994.0 |
| 2 | 3 | 5121 | city scrape | BlissArtsSpace! | One room available for rent in a 2 bedroom apt... | NaN | 7356 | Garon | 2009-02-03 | New York, NY | ... | 4.91 | 4.47 | 4.52 | 0 | 2 | 2 | 0 | 0.30 | 1 | 17994.0 |
| 3 | 4 | 6848 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | 15991 | Allen & Irina | 2009-05-06 | New York, NY | ... | 4.80 | 4.67 | 4.56 | 0 | 1 | 0 | 0 | 1.13 | 1 | 17994.0 |
| 4 | 5 | 5178 | city scrape | Large Furnished Room Near B'way | Please don’t expect the luxury here just a bas... | Theater district, many restaurants around here. | 8967 | Shunichi | 2009-03-03 | New York, NY | ... | 4.45 | 4.88 | 4.39 | 0 | 1 | 1 | 0 | 3.38 | 1 | 17994.0 |
5 rows × 53 columns
from google.colab import files
df_airbnb.to_csv('NYC_Listings_filtered.csv')
files.download('NYC_Listings_filtered.csv')